{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "139ef0f8-0e17-461c-96b2-da1f478a383b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from linearmodels import PanelOLS\n",
    "from linearmodels.panel import PooledOLS\n",
    "import statsmodels.api as sm\n",
    "from pandas.tseries.offsets import MonthEnd\n",
    "cd_data = '.../Data/Daily/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8a5bfbb7-b6c8-402a-b66c-f60e53c6b292",
   "metadata": {},
   "outputs": [],
   "source": [
    "#  Congress tweets daily freq\n",
    "df_firm_daily = pd.read_parquet(cd_data + 'Congress_tweets_daily.parquet')\n",
    "df_firm_daily['date']      = pd.to_datetime(df_firm_daily['date'])\n",
    "df_firm_daily['Relevant'] = 1\n",
    "\n",
    "#  Stock returns daily f req\n",
    "df_returns = pd.read_parquet(cd_data + 'Returns_daily.parquet')\n",
    "df_returns['date'] = pd.to_datetime(df_returns['date']) \n",
    "\n",
    "df = pd.merge(df_returns, df_firm_daily, how = 'left', on =['permno','date'] )\n",
    "df['Relevant'] = np.where(df['Relevant']  == 1, 1, 0)\n",
    "df['tone'] = np.where(df['Relevant']  == 1, df['tone'], 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b44e603-d3d0-414a-93ac-732787647dd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "#  Firm-level controls\n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "#   News articles concering the target companies\n",
    "df_news_daily = pd.read_parquet(cd_data + 'News_headlines_daily.parquet')\n",
    "df_news_daily['date']      = pd.to_datetime(df_news_daily['date'])\n",
    "df = pd.merge(df, df_news_daily, how = 'left', on =['permno','date'] )\n",
    "df['toneNews'] = df['toneNews'].fillna(0)\n",
    "df['toneNews_relevant'] = np.where(df['Relevant']  == 1, df['toneNews'], 0)\n",
    "\n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "#  Tweets from major news media outlets\n",
    "df_NewsTweets_daily = pd.read_parquet(cd_data + 'News_tweets_daily.parquet')\n",
    "df_NewsTweets_daily['date']      = pd.to_datetime(df_NewsTweets_daily['date'])\n",
    "df = pd.merge(df, df_NewsTweets_daily, how = 'left', on =['permno','date'] )\n",
    "df['toneNewsTweets'] = np.where(df['Relevant']  == 1, df['toneNewsTweets'], 0)\n",
    "\n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "# Bloomberg sentiment measure\n",
    "df_Bloomberg_daily = pd.read_parquet(cd_data + 'BloombergSentiment_daily.parquet')\n",
    "df_Bloomberg_daily['date']      = pd.to_datetime(df_Bloomberg_daily['date'])\n",
    "df = pd.merge(df, df_Bloomberg_daily, how = 'left', on =['permno','date'] )\n",
    "df['BloombergSentiment'] = np.where(df['Relevant']  == 1, df['BloombergSentiment'], 0)\n",
    "\n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "# Forecast errors firm and analysts: Revenue and EPS\n",
    "df_FE_daily = pd.read_parquet(cd_data + 'ForecastErrors_daily.parquet')\n",
    "df_FE_daily['date']      = pd.to_datetime(df_FE_daily['date'])\n",
    "df = pd.merge(df, df_FE_daily, how = 'left', on =['permno','date'] )\n",
    "df['FE_SAL'] = np.where(df['Relevant']  == 1, df['FE_SAL'], 0)\n",
    "df['FE_EPS'] = np.where(df['Relevant']  == 1, df['FE_EPS'], 0)\n",
    "df['FE_FIRM_SAL'] = np.where(df['Relevant']  == 1, df['FE_FIRM_SAL'], 0)\n",
    "df['FE_FIRM_EPS'] = np.where(df['Relevant']  == 1, df['FE_FIRM_EPS'], 0)\n",
    "\n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "# Forecast revisions firm and analysts: Revenue and EPS\n",
    "df_FR_daily = pd.read_parquet(cd_data + 'ForecastRevision_daily.parquet')\n",
    "df_FR_daily['date']      = pd.to_datetime(df_FR_daily['date'])\n",
    "df = pd.merge(df, df_FR_daily, how = 'left', on =['permno','date'] )\n",
    "df['FR_SAL'] = np.where(df['Relevant']  == 1, df['FR_SAL'], 0)\n",
    "df['FR_EPS'] = np.where(df['Relevant']  == 1, df['FR_EPS'], 0)\n",
    "#  Macro-level controls\n",
    "\n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "#  Macroeconomic announcements \n",
    "df_macro = pd.read_parquet(cd_data + 'macro_controls_daily.parquet')\n",
    "df_macro['date'] = pd.to_datetime(df_macro['date'])\n",
    "df = pd.merge(df, df_macro[['date','surprise_S']], how = 'left', on =['date'] )  \n",
    "df['surprise_S'] = np.where(df['Relevant']  == 1, df['surprise_S'], 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1b942c6-ea9e-493c-a95d-77980dd5eb17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# regression\n",
    "returns_all = ['ret', 'exret_capm' , 'exret_ff3',  'exret_ff4']\n",
    "exog_var_results = ['a_0','a0_t','a_1','a1_t','b','b_t', '$R$-squared (\\%)','Observations','Firm-level controls','Macro-level controls']\n",
    "all_regs = ['ret', 'exret_capm' , 'exret_ff3',  'exret_ff4']\n",
    "exog_vars = ['Relevant','tone']\n",
    "\n",
    "controls = ['toneNews','toneNewsTweets','BloombergSentiment',\n",
    "           'FE_SAL','FE_EPS','FE_FIRM_SAL','FE_FIRM_EPS',\n",
    "            'ILLIQ','ret_lag','surprise_S']\n",
    "\n",
    "index = ['permno', 'date']\n",
    "coef_results = pd.DataFrame(columns = all_regs, index = exog_var_results )\n",
    "\n",
    "# exog vars\n",
    "for index_reg in returns_all:\n",
    "    index = ['permno', 'date']\n",
    "\n",
    "    var_select = [index_reg]\n",
    "    df_reg = df[var_select + exog_vars + controls + index].dropna().copy()\n",
    "    df_reg = df_reg.set_index(index)    \n",
    "    y =  df_reg[var_select] *100*100 # convert to bps\n",
    "    x = sm.add_constant(df_reg[exog_vars + controls])\n",
    "    mod = PanelOLS(y, x , entity_effects=True)\n",
    "    results = mod.fit(cov_type='clustered', cluster_entity=True,cluster_time = True)\n",
    "\n",
    "    params = results.params\n",
    "    tvalues = results.tstats\n",
    "    r2 = results.rsquared*100\n",
    "    nobs = results.nobs\n",
    "\n",
    "    coef_results[index_reg].loc['a_0'] = params[0]\n",
    "    coef_results[index_reg].loc['a0_t'] = tvalues[0]\n",
    "\n",
    "    coef_results[index_reg].loc['a_1'] = params[1]\n",
    "    coef_results[index_reg].loc['a1_t'] = tvalues[1]\n",
    "\n",
    "    coef_results[index_reg].loc['b'] = params[2]\n",
    "    coef_results[index_reg].loc['b_t'] = tvalues[2]\n",
    "\n",
    "    coef_results[index_reg].loc['$R$-squared (\\%)'] = r2\n",
    "    coef_results[index_reg].loc['Observations'] = nobs\n",
    "\n",
    "    coef_results[index_reg].loc['Firm-level controls'] =  'Yes'\n",
    "    coef_results[index_reg].loc['Macro-level controls'] = 'Yes'\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a73a3de1-cc9a-4918-9b10-67401d57edb8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# form deciles based on tone\n",
    "df_firm_daily_alpha = df_firm_daily.copy()\n",
    "def get_previous_quantile(date, df, quantile):\n",
    "    previous_sentiments = df.loc[ (df['date']> date - pd.offsets.BDay(365)) &  (df['date'] < date), 'tone']\n",
    "    return previous_sentiments.quantile(quantile) if not previous_sentiments.empty else None\n",
    "\n",
    "\n",
    "df_firm_daily_alpha['Low'] = df_firm_daily_alpha['date'].apply(lambda x: get_previous_quantile(x, df_firm_daily_alpha,0.10))\n",
    "df_firm_daily_alpha['High'] = df_firm_daily_alpha['date'].apply(lambda x: get_previous_quantile(x, df_firm_daily_alpha, 0.90))\n",
    "\n",
    "df_firm_daily_alpha['bins'] = 0\n",
    "df_firm_daily_alpha['bins'] = np.where(df_firm_daily_alpha['tone']>df_firm_daily_alpha['High'], 1, df_firm_daily_alpha['bins'])\n",
    "df_firm_daily_alpha['bins'] = np.where(df_firm_daily_alpha['tone']<df_firm_daily_alpha['Low'], -1, df_firm_daily_alpha['bins'])\n",
    "\n",
    "# get returns \n",
    "df_returns_alphas = df_returns.copy()\n",
    "df_returns_alphas = df_returns_alphas.sort_values(['permno','date'])\n",
    "\n",
    "df_returns_alphas['lmcap']=df_returns_alphas.groupby(['permno'])['mcap'].shift(1)\n",
    "df_temp =  df_returns_alphas[['permno','date']].copy()\n",
    "\n",
    "# business days around date\n",
    "for i in range(-1, 2):\n",
    "    df_temp[i] = df_temp['date'] + pd.offsets.BDay(i)\n",
    "    \n",
    "df_temp = df_temp.set_index(['permno','date']).stack().reset_index().rename(columns ={'level_2':'nCount','date':'date_test',0:'date'})\n",
    "df_returns_alphas_2 =pd.merge(df_temp,df_returns_alphas, how = 'left', on =['permno','date'])\n",
    "df_returns_alphas_2 = df_returns_alphas_2.sort_values(['permno','date_test','nCount'])\n",
    "df_returns_alphas_2 = df_returns_alphas_2.rename(columns ={'date':'date_port','date_test':'date'})\n",
    "\n",
    "\n",
    "df_daily_port = pd.merge(df_returns_alphas_2, df_firm_daily_alpha[['permno','date','bins']], how = 'left', on =['permno','date'] )\n",
    "df_daily_port = df_daily_port.sort_values(['permno','date','nCount'])\n",
    "\n",
    "\n",
    "n_days = 1\n",
    "def wavg(group, avg_name, weight_name):\n",
    "    d = group[avg_name]\n",
    "    w = group[weight_name]\n",
    "    try:\n",
    "        return (d * w).sum() / w.sum()\n",
    "    except ZeroDivisionError:\n",
    "        return np.nan\n",
    "    \n",
    "\n",
    "# returns relative to the day of the tweet \n",
    "df_daily_alpha = df_daily_port[df_daily_port['nCount']==1].copy() \n",
    "\n",
    "# value weighted returns\n",
    "vwret = df_daily_alpha.groupby(['date','bins']).apply(wavg, 'ret', 'lmcap').to_frame().reset_index().rename(columns={0:'vwret'})\n",
    "vwret_returns = vwret.pivot(index='date', columns='bins', values='vwret')\n",
    "vwret_returns = vwret_returns.rename(columns ={-1:'Short',0:'Neutral',1:'Long'})\n",
    "vwret_returns = vwret_returns.reset_index()\n",
    "\n",
    "# Accumulate the daily returns to the monthly frequency\n",
    "vwret_returns['date'] =vwret_returns['date']  + MonthEnd(0)\n",
    "vwret_returns_monthly = vwret_returns.copy().set_index('date')\n",
    "vwret_returns_monthly = vwret_returns_monthly +1\n",
    "vwret_returns_monthly = vwret_returns_monthly.reset_index()\n",
    "vwret_returns_monthly = vwret_returns_monthly.groupby(['date']).prod()-1\n",
    "vwret_returns_monthly['Long_Short'] = vwret_returns_monthly['Long'] - vwret_returns_monthly['Short'] \n",
    "\n",
    "# Add FF Factors\n",
    "from pandas_datareader.famafrench import get_available_datasets\n",
    "import pandas_datareader.data as web\n",
    "start = '1/1/1920'\n",
    "\n",
    "ds = web.DataReader('F-F_Research_Data_Factors', 'famafrench',start)\n",
    "FF_data  = ds[0].reset_index()\n",
    "FF_data = FF_data.rename(columns = {'Date':'date'}) \n",
    "FF_data['date'] = pd.to_datetime(FF_data['date'].astype(str)  )  +  MonthEnd(0)\n",
    "FF_data['Mkt'] = FF_data['Mkt-RF']  + FF_data['RF'] \n",
    "ff_factors_name  = ['Mkt-RF','Mkt','SMB','HML','RF']\n",
    "FF_data = FF_data[['date'] + ff_factors_name]\n",
    "FF_data[ff_factors_name] = FF_data[ff_factors_name ]/100\n",
    "\n",
    "df_mom  = web.DataReader('F-F_Momentum_Factor', 'famafrench',start)\n",
    "df_mom  = df_mom[0].reset_index()\n",
    "df_mom = df_mom.rename(columns = {'Date':'date'}) \n",
    "df_mom['date'] = pd.to_datetime(df_mom['date'].astype(str)  )  +  MonthEnd(0)\n",
    "df_mom.columns = ['date', 'Mom']\n",
    "df_mom['Mom'] = df_mom['Mom']/100\n",
    "\n",
    "FF_data = pd.merge(FF_data, df_mom, how = 'left', on= 'date')\n",
    "FF_data['date'] = pd.to_datetime(FF_data['date'])\n",
    "FF_data['date'] = FF_data['date']+ MonthEnd(0)\n",
    "\n",
    "\n",
    "vwret_returns_monthly =pd.merge(vwret_returns_monthly,FF_data, how ='left', on= 'date' )\n",
    "df_mean_return = pd.DataFrame(vwret_returns_monthly[['Short','Long','Long_Short']].mean()*100, columns = ['Average return']).T\n",
    "df_stdDev= pd.DataFrame((vwret_returns_monthly[['Short','Long','Long_Short']]*100).std(), columns = ['Standard deviation']).T\n",
    "df_stats = pd.concat([df_mean_return, df_stdDev])\n",
    "\n",
    "# Compute alphas\n",
    "df_alphas = vwret_returns_monthly.copy()\n",
    "df_alphas[['Mkt-RF','SMB','HML','Mom']] =df_alphas[['Mkt-RF','SMB','HML','Mom']]*100\n",
    "df_alphas['Const'] = 1  \n",
    "\n",
    "table_alphas_mkt = pd.DataFrame(columns = ['Short', 'Long','Long_Short'], index = ['alpha_mkt','alpha_mkt_T' ])\n",
    "table_alphas_FF3 = pd.DataFrame(columns = ['Short', 'Long','Long_Short'], index = ['alpha_FF3','alpha_FF3_T' ])\n",
    "table_alphas_FF4 = pd.DataFrame(columns = ['Short', 'Long','Long_Short'], index = ['alpha_FF4','alpha_FF4_T' ])\n",
    "\n",
    "factors_Market  = ['Mkt-RF']\n",
    "factors_FF3  = ['Mkt-RF','SMB','HML']\n",
    "factors_FF4 = ['Mkt-RF','SMB','HML','Mom']\n",
    "\n",
    "for index_Port in ['Short', 'Long']:\n",
    "    # CAPM\n",
    "    factor_select = factors_Market\n",
    "    df_reg  = df_alphas.copy()\n",
    "    df_reg['ret_rf'] = (df_reg[index_Port] - df_reg['RF'])*100\n",
    "    df_reg =  df_reg[['ret_rf'] +  ['Const'] +  factor_select].copy().dropna()\n",
    "    x = df_reg[ ['Const'] +  factor_select]\n",
    "    y = df_reg['ret_rf'] \n",
    "    result = sm.OLS(y, x).fit(cov_type='HAC',cov_kwds={'maxlags':6})\n",
    "    params = result.params\n",
    "    tvalues = result.tvalues\n",
    "    table_alphas_mkt.loc['alpha_mkt'][index_Port] = params[0]\n",
    "    table_alphas_mkt.loc['alpha_mkt_T'][index_Port] = tvalues[0]\n",
    "    \n",
    "    # FF3\n",
    "    factor_select = factors_FF3\n",
    "    df_reg  = df_alphas.copy()    \n",
    "    df_reg['ret_rf'] = (df_reg[index_Port] - df_reg['RF'])*100\n",
    "    df_reg =  df_reg[['ret_rf'] +  ['Const'] +  factor_select].copy().dropna()\n",
    "    x = df_reg[ ['Const'] +  factor_select]\n",
    "    y = df_reg['ret_rf'] \n",
    "    result = sm.OLS(y, x).fit(cov_type='HAC',cov_kwds={'maxlags':6})\n",
    "    params = result.params\n",
    "    tvalues = result.tvalues\n",
    "\n",
    "    table_alphas_FF3.loc['alpha_FF3'][index_Port] = params[0]\n",
    "    table_alphas_FF3.loc['alpha_FF3_T'][index_Port] = tvalues[0]\n",
    "\n",
    "    # FF4    \n",
    "    factor_select = factors_FF4\n",
    "    df_reg  = df_alphas.copy()    \n",
    "    df_reg['ret_rf'] = (df_reg[index_Port] - df_reg['RF'])*100    \n",
    "    df_reg =  df_reg[['ret_rf'] +  ['Const'] +  factor_select].copy().dropna()\n",
    "    x = df_reg[ ['Const'] +  factor_select]\n",
    "    y = df_reg['ret_rf'] \n",
    "    result = sm.OLS(y, x).fit(cov_type='HAC',cov_kwds={'maxlags':6})\n",
    "    params = result.params\n",
    "    tvalues = result.tvalues\n",
    "\n",
    "    table_alphas_FF4.loc['alpha_FF4'][index_Port] = params[0]\n",
    "    table_alphas_FF4.loc['alpha_FF4_T'][index_Port] = tvalues[0]   \n",
    "   \n",
    "\n",
    "# long minus short   \n",
    "# CAPM\n",
    "factor_select = factors_Market\n",
    "df_reg  = df_alphas.copy()\n",
    "df_reg['Long_Short'] = (df_reg['Long_Short'])*100\n",
    "df_reg =  df_reg[['Long_Short'] +  ['Const'] +  factor_select].copy().dropna()\n",
    "x = df_reg[ ['Const'] +  factor_select]\n",
    "y = df_reg['Long_Short'] \n",
    "result = sm.OLS(y, x).fit(cov_type='HAC',cov_kwds={'maxlags':6})\n",
    "params = result.params\n",
    "tvalues = result.tvalues\n",
    "\n",
    "table_alphas_mkt.loc['alpha_mkt']['Long_Short'] = params[0]\n",
    "table_alphas_mkt.loc['alpha_mkt_T']['Long_Short'] = tvalues[0]\n",
    "\n",
    "# FF3\n",
    "factor_select = factors_FF3\n",
    "df_reg  = df_alphas.copy()    \n",
    "df_reg['Long_Short'] = (df_reg['Long_Short'])*100\n",
    "df_reg =  df_reg[['Long_Short'] +  ['Const'] +  factor_select].copy().dropna()\n",
    "x = df_reg[ ['Const'] +  factor_select]\n",
    "y = df_reg['Long_Short'] \n",
    "result = sm.OLS(y, x).fit(cov_type='HAC',cov_kwds={'maxlags':6})\n",
    "params = result.params\n",
    "tvalues = result.tvalues\n",
    "\n",
    "table_alphas_FF3.loc['alpha_FF3']['Long_Short'] = params[0]\n",
    "table_alphas_FF3.loc['alpha_FF3_T']['Long_Short'] = tvalues[0]\n",
    "\n",
    "# FF4    \n",
    "factor_select = factors_FF4\n",
    "df_reg  = df_alphas.copy()    \n",
    "df_reg['Long_Short'] = (df_reg['Long_Short'])*100\n",
    "df_reg =  df_reg[['Long_Short'] +  ['Const'] +  factor_select].copy().dropna()\n",
    "x = df_reg[ ['Const'] +  factor_select]\n",
    "y = df_reg['Long_Short'] \n",
    "result = sm.OLS(y, x).fit(cov_type='HAC',cov_kwds={'maxlags':6})\n",
    "params = result.params\n",
    "tvalues = result.tvalues\n",
    "\n",
    "table_alphas_FF4.loc['alpha_FF4']['Long_Short'] = params[0]\n",
    "table_alphas_FF4.loc['alpha_FF4_T']['Long_Short'] = tvalues[0]   \n",
    "\n",
    "\n",
    "table_alphas = pd.concat([df_stats, table_alphas_mkt,table_alphas_FF3,table_alphas_FF4])      "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
